In this notebook, we do a bit of exploration on our IPUMSR data.

library(ipumsr)
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(sf)
Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(tidycensus)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
rel_file <- 'nhgis0005_shape/nhgis0005_shapefile_tl2008_us_tract_1940.zip'

cens1940 <- ipumsr::read_nhgis('../data/nhgis0005_csv.zip')
Use of data from NHGIS is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.

indexed 805.00B in  0s, 12.99MB/s
                                                                         
indexed 2.15GB in  0s, 2.15GB/s
                                                                         
Rows: 7467 Columns: 114── Column specification ───────────────────────────────────────────────────
Delimiter: ","
chr   (9): GISJOIN, STATE, STATEA, COUNTY, COUNTYA, PRETRACTA, TRACTA, ...
dbl (105): YEAR, BUB001, BUQ001, BUQ002, BVG001, BVP001, BUH001, BUH002...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cens1940shp <- ipumsr::read_ipums_sf('../data/nhgis0005_shape.zip', 
                                file_select=rel_file) |>
  select(GISJOIN)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(rel_file)

# Now:
data %>% select(all_of(rel_file))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
cens1940 |> merge(cens1940shp, by='GISJOIN')

Note that we have 7,465 resultant shapes. That means we lose two tracts worth of data. Let’s see if these are easily resolvable:

# check what is missing
cens1940 |> filter(!GISJOIN %in% cens1940shp$GISJOIN)

Okay, tragically, that’s not as easy to fix as one would hope. If you investigate these further, there doesn’t seem to be any equivalent tracts in our shapefile. Additionally, these tracts don’t seem to be the product of tract splitting, so we can’t just recombine them. We’ll just have to accept the 2 tract data loss.

# rename_map defined elsewhere, just makes variables human readable
cens1940 <- cens1940 |> merge(cens1940shp, by='GISJOIN')
colnames(cens1940) <- rename_map[colnames(cens1940)]

# TIME TO DO VARIABLE REFACTORING
cens1940 <- cens1940 |>
  mutate(
    whiteP    = round(100 * popWhite / popTotal, 2),
    nonWhiteP = round(100 * popNonwhite / popTotal, 2),
    blackP    = round(100 * negroPopTotal / popTotal, 2),
    noSchool   = maleNoSchool + femaleNoSchool,
    elementary = maleElem1to4 + maleElem5to6 + maleElem7to8 + femaleElem1to4 + femaleElem5to6 + 
      femaleElem7to8,
    highSchool = maleHS1to3 + maleHS4 + femaleHS1to3 + femaleHS4,
    college    = maleCollege1to3 + maleCollege4plus + femaleCollege1to3 + femaleCollege4plus,
    noReportSchool = maleNoSchoolReport + femaleNoSchoolReport,
    medMaleSchoolingYears = maleMedianYears,
    medFemaleSchoolingYears = femaleMedianYears,
    pop14Plus = maleInLabor + maleNotInLabor + femaleInLabor + femaleNotInLabor
  ) |>
  mutate(
    employed = maleEmployed + femaleEmployed,
    employedP = round(100 * (maleEmployed + femaleEmployed) / pop14Plus, 2),
    seekWorkP = round(100 * (maleSeekWork + femaleSeekWork) / pop14Plus, 2),
    notInLaborP = round(100 * (maleNotInLabor + femaleNotInLabor) / pop14Plus, 2),
  ) |>  
  mutate(
    profP = round(100 * (maleProf + femaleProf) / employed, 2),
    semiProfP = round(100 * (maleSemiProf + femaleSemiProf) / employed, 2),
    propP = round(100 * (maleProp + femaleProp) / employed, 2),
    clericalP = round(100 * (maleClerical + femaleClerical) / employed, 2),
    craftsP = round(100 * (maleCrafts + femaleCrafts) / employed, 2),
    operativesP = round(100 * (maleOperatives + femaleOperatives) / employed, 2),
    domesticP = round(100 * (maleDomestic + femaleDomestic) / employed, 2),
    serviceP = round(100 * (maleService + femaleService) / employed, 2),
    laborP = round(100 * (maleLabor + femaleLabor) / 100, 2)
  ) |>
  select(GISJOIN, YEAR, STATE, STATEA, COUNTY, COUNTYA, PRETRACTA, TRACTA, POSTTRCTA, AREANAME, geometry, whiteP, nonWhiteP, blackP, 
         noSchool, elementary, highSchool, college, 
         noReportSchool, medMaleSchoolingYears, 
         medFemaleSchoolingYears, employedP, 
         seekWorkP, notInLaborP, profP, semiProfP, 
         propP, clericalP, craftsP, operativesP, 
         domesticP, serviceP, laborP, fam1Detach, 
         fam1Attach, fam2SideBySide, fam2Other, fam3, 
         fam4, fam1to4WithBiz, fam5to9, fam10to19, 
         fam20plus, otherStruct, underPt51, pt51to75, 
         pt76to1, pt1to1p5, pt1p5to2, pt2plus, 
         noReportRoom, tenUnderPt51, tenPt51to75, tenPt76to1, 
         tenPt1to1p5, tenPt1p5to2, tenPt2plus, tenNoReport, 
         noMajorRepair, majorRepair, repairNoReport, radio, 
         noRadio, radioNoReport, refrigMech, refrigIce, 
         refrigOther, refrigNone, refrigNoReport, heatCentral, 
         heatNoCentral, heatNoReport) 

head(cens1940)
# Read in redlining
redlining <- sf::read_sf('../data/shapes/mappinginequality.gpkg') |>
  select(grade) |>
  st_transform('ESRI:102008')

# Merge tracts on redlining neighborhoods
cens1940rdl <- cens1940shp |> 
  sf::st_transform('ESRI:102008') |>
  sf::st_join(redlining) 

# Get areas and drop geometries
cens1940rdl$area <- cens1940rdl |>
  sf::st_area()

cens1940rdl <- st_drop_geometry(cens1940rdl)

# Do manipulations to get percent overlap by GISJOIN by grade
cens1940rdl <- cens1940rdl |> 
  group_by(GISJOIN, grade) |>
  mutate(totAreaGrade=sum(area)) |>
  select(GISJOIN, grade, totAreaGrade) |>
  unique() |>
  ungroup() |>
  group_by(GISJOIN) |>
  mutate(totArea=sum(totAreaGrade)) |> 
  mutate(propArea=totAreaGrade / totArea) |>
  select(GISJOIN, grade, propArea) |>
  na.exclude()

cens1940rdl <- cens1940rdl |> 
  group_by(GISJOIN) |> 
  mutate(maxPropArea = max(propArea))

# Determine the one with maximal overlap.
# We default to accepting the *highest*
# assigned grade, in case of ties.
# This should only work to weaken observed
# disparities.
cens1940rdl <- cens1940rdl |> filter(
  maxPropArea == propArea
) |>
  dplyr::arrange('GISJOIN', 'grade') |>
  distinct(GISJOIN, .keep_all = TRUE) |>
  select(GISJOIN, grade)

# We lose ~700 observations in the process:
cens1940rdl

With redlining data, we can merge it in and start doing some fun stuff!

# save progress
cens1940 |> sf::st_write('../data/shapes/cens1940shapes.shp', append=FALSE)
Warning: Field names abbreviated for ESRI Shapefile driver
Deleting layer `cens1940shapes' using driver `ESRI Shapefile'
Writing layer `cens1940shapes' to data source 
  `../data/shapes/cens1940shapes.shp' using driver `ESRI Shapefile'
Writing 6856 features with 72 fields and geometry type Multi Polygon.

Let’s filter down to the city’s of interest for our propensity matching analysis:

Cool, let’s do some exploration!

---
title: "R Notebook"
output: html_notebook
---

In this notebook, we do a bit of exploration on our IPUMSR data.

```{r libraries}
library(ipumsr)
library(dplyr)
library(sf)
library(tidycensus)
```

```{r}
rel_file <- 'nhgis0005_shape/nhgis0005_shapefile_tl2008_us_tract_1940.zip'

cens1940 <- ipumsr::read_nhgis('../data/nhgis0005_csv.zip')
cens1940shp <- ipumsr::read_ipums_sf('../data/nhgis0005_shape.zip', 
                                file_select=rel_file) |>
  select(GISJOIN)

cens1940 |> merge(cens1940shp, by='GISJOIN')
```

Note that we have 7,465 resultant shapes. That means we lose two tracts worth of data. Let's see if these are easily resolvable:

```{r}
# check what is missing
cens1940 |> filter(!GISJOIN %in% cens1940shp$GISJOIN)
```

Okay, tragically, that's not as easy to fix as one would hope. If you investigate these further, there doesn't seem to be any equivalent tracts in our shapefile. Additionally, these tracts don't seem to be the product of tract splitting, so we can't just recombine them. We'll just have to accept the 2 tract data loss.

```{r rename map, include=FALSE}
rename_map <- c(
  # Metadata we don't want to rename
  'GISJOIN'='GISJOIN',   'YEAR'='YEAR',
  'STATE'='STATE',       'STATEA'='STATEA',
  'COUNTY'='COUNTY',     'COUNTYA'='COUNTYA',
  'PRETRACTA'='PRETRACTA', 'TRACTA'='TRACTA',   
  'POSTTRCTA'='POSTTRCTA', 'AREANAME'='AREANAME',
  'geometry'='geometry',
  # NT1: Population 
  'BUB001' = "popTotal",
  # NT2: Population by Race
  'BUQ001' = "popWhite", 'BUQ002' = "popNonwhite",
  # NT4: Negro Population
  'BVG001' = "negroPopTotal",
  # NT5: Occupied Dwelling Units
  'BVP001' = "occDwellingUnitsTotal",
  # NT15: Persons 25 Years and Over by Sex and Years of School Completed
  'BUH001' = "maleNoSchool",       'BUH002' = "maleElem1to4",
  'BUH003' = "maleElem5to6",       'BUH004' = "maleElem7to8",
  'BUH005' = "maleHS1to3",         'BUH006' = "maleHS4",
  'BUH007' = "maleCollege1to3",    'BUH008' = "maleCollege4plus",
  'BUH009' = "maleNoSchoolReport",       'BUH010' = "femaleNoSchool",
  'BUH011' = "femaleElem1to4",     'BUH012' = "femaleElem5to6",
  'BUH013' = "femaleElem7to8",     'BUH014' = "femaleHS1to3",
  'BUH015' = "femaleHS4",          'BUH016' = "femaleCollege1to3",
  'BUH017' = "femaleCollege4plus", 'BUH018' = "femaleNoSchoolReport",
  # NT16: Persons 25 Years and Over by Sex by Median Years of School Completed
  'BUI001' = "maleMedianYears", 'BUI002' = "femaleMedianYears",
  # NT21: Sex by Labor Force Status [Persons 14 Years and Over]
  'BUW001' = "maleInLabor",   'BUW002' = "maleNotInLabor",
  'BUW003' = "femaleInLabor", 'BUW004' = "femaleNotInLabor",
  # NT22: Sex by Detailed Labor Force Status [Persons 14 Years and Over]
  'BUX001' = "maleEmployed",           'BUX002' = "malePubEmergWork",
  'BUX003' = "maleSeekWork",           'BUX004' = "maleNotInLaborHouse",
  'BUX005' = "maleNotInLaborSchool",   'BUX006' = "maleNotInLaborUnable",
  'BUX007' = "maleNotInLaborInst",     'BUX008' = "maleNotInLaborOther",
  'BUX009' = "femaleEmployed",         'BUX010' = "femalePubEmergWork",
  'BUX011' = "femaleSeekWork",         'BUX012' = "femaleNotInLaborHouse",
  'BUX013' = "femaleNotInLaborSchool", 'BUX014' = "femaleNotInLaborUnable",
  'BUX015' = "femaleNotInLaborInst",   'BUX016' = "femaleNotInLaborOther",
  # NT25: Population by Sex by Occupational Group [Employed Persons]
  'BU0001' = "maleProf",       'BU0002' = "maleSemiProf",
  'BU0003' = "maleProp",       'BU0004' = "maleClerical",
  'BU0005' = "maleCrafts",     'BU0006' = "maleOperatives",
  'BU0007' = "maleDomestic",   'BU0008' = "maleService",
  'BU0009' = "maleLabor",      'BU0010' = "maleNoOccReport",
  'BU0011' = "femaleProf",     'BU0012' = "femaleSemiProf",
  'BU0013' = "femaleProp",     'BU0014' = "femaleClerical",
  'BU0015' = "femaleCrafts",   'BU0016' = "femaleOperatives",
  'BU0017' = "femaleDomestic", 'BU0018' = "femaleService",
  'BU0019' = "femaleLabor",    'BU0020' = "femaleNoOccReport",
  # NT30: Dwelling Units by Type of Structure
  'BU6001' = "fam1Detach",     'BU6002' = "fam1Attach",
  'BU6003' = "fam2SideBySide", 'BU6004' = "fam2Other",
  'BU6005' = "fam3",           'BU6006' = "fam4",
  'BU6007' = "fam1to4WithBiz", 'BU6008' = "fam5to9",
  'BU6009' = "fam10to19",      'BU6010' = "fam20plus",
  'BU6011' = "otherStruct", 
  # NT32: Occupied Dwelling Units
  'BU8001' = "underPt51",   'BU8002' = "pt51to75",
  'BU8003' = "pt76to1",     'BU8004' = "pt1to1p5",    
  'BU8005' = "pt1p5to2",    'BU8006' = "pt2plus",     
  'BU8007' = "noReportRoom",
  # NT33: Tenant-Occupied Dwelling Units
  'BU9001' = "tenUnderPt51", 'BU9002' = "tenPt51to75",
  'BU9003' = "tenPt76to1",   'BU9004' = "tenPt1to1p5",
  'BU9005' = "tenPt1p5to2",  'BU9006' = "tenPt2plus",
  'BU9007' = "tenNoReport",
  # NT43: Dwelling Units by State of Repair
  'BVK001' = "noMajorRepair", 'BVK002' = "majorRepair",
  'BVK003' = "repairNoReport",
  # NT45: Occupied Dwelling Units by Radio Ownership
  'BVM001' = "radio", 'BVM002' = "noRadio",
  'BVM003' = "radioNoReport",
  # NT46: Occupied Dwelling Units by Refrigeration
  'BVN001' = "refrigMech",  'BVN002' = "refrigIce",
  'BVN003' = "refrigOther", 'BVN004' = "refrigNone",
  'BVN005' = "refrigNoReport",
  # NT47: Occupied Dwelling Units by Heating
  'BVO001' = "heatCentral", 'BVO002' = "heatNoCentral",
  'BVO003' = "heatNoReport"
)
```

```{r}
# rename_map defined elsewhere, just makes variables human readable
cens1940 <- cens1940 |> merge(cens1940shp, by='GISJOIN')
colnames(cens1940) <- rename_map[colnames(cens1940)]

# TIME TO DO VARIABLE REFACTORING
cens1940 <- cens1940 |>
  mutate(
    whiteP    = round(100 * popWhite / popTotal, 2),
    nonWhiteP = round(100 * popNonwhite / popTotal, 2),
    blackP    = round(100 * negroPopTotal / popTotal, 2),
    noSchool   = maleNoSchool + femaleNoSchool,
    elementary = maleElem1to4 + maleElem5to6 + maleElem7to8 + femaleElem1to4 + femaleElem5to6 + 
      femaleElem7to8,
    highSchool = maleHS1to3 + maleHS4 + femaleHS1to3 + femaleHS4,
    college    = maleCollege1to3 + maleCollege4plus + femaleCollege1to3 + femaleCollege4plus,
    noReportSchool = maleNoSchoolReport + femaleNoSchoolReport,
    medMaleSchoolingYears = maleMedianYears,
    medFemaleSchoolingYears = femaleMedianYears,
    pop14Plus = maleInLabor + maleNotInLabor + femaleInLabor + femaleNotInLabor
  ) |>
  mutate(
    employed = maleEmployed + femaleEmployed,
    employedP = round(100 * (maleEmployed + femaleEmployed) / pop14Plus, 2),
    seekWorkP = round(100 * (maleSeekWork + femaleSeekWork) / pop14Plus, 2),
    notInLaborP = round(100 * (maleNotInLabor + femaleNotInLabor) / pop14Plus, 2),
  ) |>  
  mutate(
    profP = round(100 * (maleProf + femaleProf) / employed, 2),
    semiProfP = round(100 * (maleSemiProf + femaleSemiProf) / employed, 2),
    propP = round(100 * (maleProp + femaleProp) / employed, 2),
    clericalP = round(100 * (maleClerical + femaleClerical) / employed, 2),
    craftsP = round(100 * (maleCrafts + femaleCrafts) / employed, 2),
    operativesP = round(100 * (maleOperatives + femaleOperatives) / employed, 2),
    domesticP = round(100 * (maleDomestic + femaleDomestic) / employed, 2),
    serviceP = round(100 * (maleService + femaleService) / employed, 2),
    laborP = round(100 * (maleLabor + femaleLabor) / 100, 2)
  ) |>
  select(GISJOIN, YEAR, STATE, STATEA, COUNTY, COUNTYA, PRETRACTA, TRACTA, POSTTRCTA, AREANAME, geometry, whiteP, nonWhiteP, blackP, 
         noSchool, elementary, highSchool, college, 
         noReportSchool, medMaleSchoolingYears, 
         medFemaleSchoolingYears, employedP, 
         seekWorkP, notInLaborP, profP, semiProfP, 
         propP, clericalP, craftsP, operativesP, 
         domesticP, serviceP, laborP, fam1Detach, 
         fam1Attach, fam2SideBySide, fam2Other, fam3, 
         fam4, fam1to4WithBiz, fam5to9, fam10to19, 
         fam20plus, otherStruct, underPt51, pt51to75, 
         pt76to1, pt1to1p5, pt1p5to2, pt2plus, 
         noReportRoom, tenUnderPt51, tenPt51to75, tenPt76to1, 
         tenPt1to1p5, tenPt1p5to2, tenPt2plus, tenNoReport, 
         noMajorRepair, majorRepair, repairNoReport, radio, 
         noRadio, radioNoReport, refrigMech, refrigIce, 
         refrigOther, refrigNone, refrigNoReport, heatCentral, 
         heatNoCentral, heatNoReport) 

head(cens1940)
```

```{r get redlining assignments}
# Read in redlining
redlining <- sf::read_sf('../data/shapes/mappinginequality.gpkg') |>
  select(grade) |>
  st_transform('ESRI:102008')

# Merge tracts on redlining neighborhoods
cens1940rdl <- cens1940shp |> 
  sf::st_transform('ESRI:102008') |>
  sf::st_join(redlining) 

# Get areas and drop geometries
cens1940rdl$area <- cens1940rdl |>
  sf::st_area()

cens1940rdl <- st_drop_geometry(cens1940rdl)

# Do manipulations to get percent overlap by GISJOIN by grade
cens1940rdl <- cens1940rdl |> 
  group_by(GISJOIN, grade) |>
  mutate(totAreaGrade=sum(area)) |>
  select(GISJOIN, grade, totAreaGrade) |>
  unique() |>
  ungroup() |>
  group_by(GISJOIN) |>
  mutate(totArea=sum(totAreaGrade)) |> 
  mutate(propArea=totAreaGrade / totArea) |>
  select(GISJOIN, grade, propArea) |>
  na.exclude()

cens1940rdl <- cens1940rdl |> 
  group_by(GISJOIN) |> 
  mutate(maxPropArea = max(propArea))

# Determine the one with maximal overlap.
# We default to accepting the *highest*
# assigned grade, in case of ties.
# This should only work to weaken observed
# disparities.
cens1940rdl <- cens1940rdl |> filter(
  maxPropArea == propArea
) |>
  dplyr::arrange('GISJOIN', 'grade') |>
  distinct(GISJOIN, .keep_all = TRUE) |>
  select(GISJOIN, grade)

# We lose ~700 observations in the process:
cens1940rdl
```

With redlining data, we can merge it in and start doing some fun stuff!

```{r merge in}
cens1940 <- cens1940 |> merge(cens1940rdl, by='GISJOIN', how='inner')

cens1940 <- cens1940 |> filter(grade != 'E')

# save progress
cens1940 |> sf::st_write('../data/shapes/cens1940shapes.shp', append=FALSE)

cens1940 |> head()
```

Let's filter down to the city's of interest for our propensity matching analysis:

```{r}

cities <- c('Chicago', 'philadelphia', 'detroit', "los angeles", 'new york city')

filter_func <- function(place) {
  purrr::map(cities,
    ~ stringr::str_detect(
      place, stringr::regex(.x, ignore_case=T)
    )
  ) |>
  purrr::reduce(c) |>
  any()
}

filter_func('chicago')

mask <- purrr::map(cens1940$AREANAME, ~ filter_func(.x)) |> purrr::reduce(c)
nrow(cens1940[mask,])

```

Cool, let's do some exploration!
```{r}
exclude_list <- c('GISJOIN', 'YEAR', 'STATE', 'STATEA', 'COUNTY', 'COUNTYA', 'PRETRACTA', 'TRACTA', 'POSTTRCTA', 'AREANAME')

analysis_data <- cens1940 |> 
  select(-all_of(exclude_list)) |> 
  st_drop_geometry() |> 
  select(-geometry)

analysis_data

analysis_data |> mlr::summarizeColumns()

```

```{r}
analysis_data_stand <- analysis_data |> mlr::normalizeFeatures(on.constant = "warn")

grade_assigns <- c('A'='AB', 'B'='AB', 'C'='CD', 'D'='CD')

analysis_data_stand$grade <- grade_assigns[analysis_data_stand$grade]

regTask <- mlr::makeClassifTask(data=analysis_data_stand, target='grade')

binClass <- mlr::makeLearner('classif.binomial', predict.type='prob', type='discrete', def='logit', constr='logit')


mlr::getLearnerParamSet(binClass)

mlr::setHyperPars(binClass, type='discrete', def='logit', constr='logit', )
```
































